In [1]:
import os
import math
import numpy as np
from ctypes import CDLL, POINTER,  c_int32, c_int16, c_bool
from IPython.display import display, Audio, Markdown
from scipy.signal import spectrogram
import matplotlib.pyplot as plt

from pydub import AudioSegment

# Init C interface
#
# Loads the compiled DSP shared library through ctypes and declares the
# prototype (argtypes / restype) of every C function used by this notebook so
# that ctypes converts arguments and return values correctly.

# os.path.abspath('') resolves to the current working directory, and
# os.path.dirname() then strips its last component. So, despite its name,
# `current_dir` is the PARENT of the working directory, and the library is
# looked up at <parent of cwd>/build/libdsp.so.
# NOTE(review): presumably the notebook lives one level below the project
# root - confirm.
current_dir = os.path.dirname(os.path.realpath(os.path.abspath('')))
so_file = os.path.normpath(current_dir + '/build/libdsp.so')

# Raises OSError if the shared library cannot be found or loaded.
c_interface = CDLL(so_file)

# init_audio_accumulator takes five 32-bit ints and returns nothing (void).
# See the call further down for the order in which the values are passed.
c_interface.init_audio_accumulator.argtypes = (
    c_int32,
    c_int32,
    c_int32,
    c_int32,
    c_int32,
)
c_interface.init_audio_accumulator.restype = None

# accumulate_input takes a single 1-D, C-contiguous int16 NumPy array.
# The ndpointer type makes ctypes validate dtype / ndim / flags at call time
# and reject arrays that do not match.
c_interface.accumulate_input.argtypes = [
    np.ctypeslib.ndpointer(dtype=np.int16, ndim=1, flags='CONTIGUOUS')
]
c_interface.accumulate_input.restype = None

# extract_output returns a raw int16 pointer. The pointer carries no length,
# so the caller must know how many samples are valid behind it.
c_interface.extract_output.argtypes = []
c_interface.extract_output.restype = POINTER(c_int16)

# is_output_ready takes no arguments and returns a C bool.
c_interface.is_output_ready.argtypes = []
c_interface.is_output_ready.restype = c_bool

# Init audio accumulator
#
# Sizes below are expressed in FRAMES. process_audio multiplies them by the
# channel count to get buffer lengths in samples, i.e. one frame holds one
# sample per channel.

NUM_INPUT_FRAMES = 1024                         # frames handed to accumulate_input per call
NUM_PROCESSING_FRAMES = (NUM_INPUT_FRAMES * 2)  # passed to the C init below; twice the input size
NUM_OUTPUT_FRAMES = 2048                        # frames read back per extract_output call
NUM_DELAY_FRAMES = 2048                         # NOTE(review): not referenced anywhere in the visible code
SAMPLE_RATE = 22050                             # Hz; audio is resampled to this rate on load
OSAMP = 2                                       # presumably an oversampling / overlap factor - TODO confirm against the C code
NUM_CHANNELS = 2                                # interleaved stereo
SAMPLES_PER_SECOND = SAMPLE_RATE * NUM_CHANNELS # interleaved samples (all channels) per second

# One-time initialisation of the C-side accumulator. Arguments are positional,
# so the order here must match the C function's parameter order.
c_interface.init_audio_accumulator(
    NUM_INPUT_FRAMES,
    NUM_OUTPUT_FRAMES,
    NUM_PROCESSING_FRAMES,
    OSAMP,
    SAMPLE_RATE
)

# Display audio functions

def display_spectrogram(audio):
    """Render a dB-scaled spectrogram of a single-channel signal.

    The spectrogram is computed at SAMPLE_RATE with a 1024-sample Hann window
    and drawn on the current matplotlib figure, which is then shown.

    Args:
        audio: 1-D array of samples for one channel.
    """
    freqs, times, power = spectrogram(audio, fs=SAMPLE_RATE, window='hann', nperseg=1024)
    # The small offset keeps log10 finite for bins with zero power.
    power_db = 10 * np.log10(power + 1e-10)
    plt.pcolormesh(times, freqs, power_db, shading='gouraud')
    plt.xlabel('Time [sec]')
    plt.ylabel('Frequency [Hz]')
    plt.colorbar(label='dB')
    plt.show()

def display_audio(name, audio):
    """Show a titled audio player for `audio`, followed by its spectrogram.

    Args:
        name: Heading text rendered above the player.
        audio: 1-D array of samples for one channel. The same array is fed to
            both the left and right channel of the player.
    """
    heading = Markdown('### ' + name)
    player = Audio(data=(audio, audio), rate=SAMPLE_RATE)
    display(heading, player)
    display_spectrogram(audio)

# Audio processing function

def process_audio(audio_file_name, start_seconds=30, end_seconds=60):
    """Run an excerpt of an MP3 through the C audio accumulator and display it.

    The track "audio/<audio_file_name>.mp3" is decoded, converted to
    SAMPLE_RATE / NUM_CHANNELS / 16-bit, and the excerpt
    [start_seconds, end_seconds) is fed to the C library in buffers of
    NUM_INPUT_FRAMES frames. Every time the library reports that output is
    ready, NUM_OUTPUT_FRAMES frames are read back. A trailing partial input
    buffer is dropped. The (mono-mixed) input and each output channel are then
    shown with an audio player and a spectrogram.

    Args:
        audio_file_name: Track name without directory or ".mp3" extension.
        start_seconds: Start of the excerpt, in seconds from the track start.
        end_seconds: End of the excerpt, in seconds from the track start.

    Raises:
        ValueError: If the excerpt bounds are invalid, or if the excerpt is too
            short to fill a single input buffer (e.g. the track is shorter
            than `start_seconds`).
    """
    if start_seconds < 0 or end_seconds <= start_seconds:
        raise ValueError(
            "Invalid excerpt bounds: start_seconds=%r, end_seconds=%r"
            % (start_seconds, end_seconds)
        )

    # Load audio (and interleave it like a real world audio signal).
    # Channel count and sample width are forced so the interleaving and the
    # int16 contract of accumulate_input hold regardless of how the file was
    # encoded (a mono or 24-bit source would otherwise be misread or rejected).
    file_path = "audio/" + audio_file_name + ".mp3"
    audio = (
        AudioSegment.from_mp3(file_path)
        .set_frame_rate(SAMPLE_RATE)
        .set_channels(NUM_CHANNELS)
        .set_sample_width(2)
    )
    all_samples = np.array(audio.get_array_of_samples(), dtype=np.int16)

    # Convert seconds to whole frames first so the excerpt always starts and
    # ends on a frame boundary (keeps left/right channels aligned).
    start_sample = int(start_seconds * SAMPLE_RATE) * NUM_CHANNELS
    end_sample = int(end_seconds * SAMPLE_RATE) * NUM_CHANNELS
    input_audio = all_samples[start_sample:end_sample]

    samples_per_input_buffer = NUM_INPUT_FRAMES * NUM_CHANNELS
    samples_per_output_buffer = NUM_OUTPUT_FRAMES * NUM_CHANNELS
    n_windows = len(input_audio) // samples_per_input_buffer
    if n_windows == 0:
        raise ValueError(
            "'%s' has only %d samples between %ss and %ss; at least %d are "
            "needed to fill one input buffer"
            % (file_path, len(input_audio), start_seconds, end_seconds,
               samples_per_input_buffer)
        )

    # Process each window of audio
    output_chunks = []
    for i in range(n_windows):
        start = i * samples_per_input_buffer
        window = input_audio[start:start + samples_per_input_buffer]
        c_interface.accumulate_input(window)
        if c_interface.is_output_ready():
            output_ptr = c_interface.extract_output()
            chunk = np.ctypeslib.as_array(output_ptr, shape=(samples_per_output_buffer,))
            # as_array only wraps the C memory. Copy it now, since the buffer
            # is presumably owned by the library and may be overwritten by the
            # next accumulate_input call.
            output_chunks.append(chunk.copy())

    # Join once at the end instead of re-concatenating on every iteration.
    if output_chunks:
        output_audio = np.concatenate(output_chunks)
    else:
        output_audio = np.empty((0,), dtype=np.int16)

    # Deinterleave audio
    input_l = input_audio[0::NUM_CHANNELS]
    input_r = input_audio[1::NUM_CHANNELS]
    output_l = output_audio[0::NUM_CHANNELS]
    output_r = output_audio[1::NUM_CHANNELS]

    # Display output. The input is mixed down in float32 so the sum of two
    # int16 samples cannot overflow.
    input_mono = (input_l.astype(np.float32) + input_r.astype(np.float32)) / 2
    display_audio("Input", input_mono)
    display_audio("Output (l)", output_l)
    display_audio("Output (r)", output_r)

# Run the whole pipeline once per demo track, each under its own heading.
# Names are given without directory or extension; process_audio adds both.
audio_file_names = [
    "Bad Bunny - NUEVAYoL",
    "Miley Cyrus - We Can't Stop",
    "SZA - BMF",
]
for audio_file_name in audio_file_names:
    display(Markdown(f'## {audio_file_name}'))
    process_audio(audio_file_name)

Bad Bunny - NUEVAYoL¶

Input¶

Your browser does not support the audio element.

Output (l)¶

Your browser does not support the audio element.

Output (r)¶

Your browser does not support the audio element.

Miley Cyrus - We Can't Stop¶

Input¶

Your browser does not support the audio element.

Output (l)¶

Your browser does not support the audio element.

Output (r)¶

Your browser does not support the audio element.

SZA - BMF¶

Input¶

Your browser does not support the audio element.

Output (l)¶

Your browser does not support the audio element.

Output (r)¶

Your browser does not support the audio element.